This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
rm(list = ls(all=TRUE)) # clear global environment
library(tidyr) # data loading
library(dplyr) # dataframe manipulation
library(stringr) # string manipulation
library(ggplot2) # plotting
path <- file.path("C:","Users","Paulina-laptop","Desktop","R_Programming","RLadies_workshop")
setwd(path)
sprintf("Current directory: %s", getwd())
## [1] "Current directory: C:/Users/Paulina-laptop/Desktop/R_Programming/RLadies_workshop"
Let’s load some data first
wells <- read.csv(unzip("BCOGC_data/prod_csv.zip", "zone_prd_2016_to_present.csv"), skip = 1)
head(wells)
## Wa_num Compltn_event_seq Prod_period UWI Area_code Formtn_code
## 1 29 0 201601 100142108318W600 3600 6200
## 2 29 0 201603 100142108318W600 3600 6200
## 3 29 0 201611 100142108318W600 3600 6200
## 4 29 0 201612 100142108318W600 3600 6200
## 5 29 0 201701 100142108318W600 3600 6200
## 6 29 0 201702 100142108318W600 3600 6200
## Pool_seq Gas_prod_vol..e3m3. Oil_prod_vol..m3. Water_prod_vol..m3.
## 1 A 107.7 0 2.3
## 2 A 45.2 0 1.7
## 3 A 166.2 0 5.0
## 4 A 159.2 0 2.6
## 5 A 133.5 0 3.7
## 6 A 95.8 0 4.1
## Cond_prod_vol..m3. Prod_days. Gas_prod_cum..e3m3. Oil_prod_cum..m3.
## 1 0.5 31.0 107.7 0
## 2 0.3 13.0 152.9 0
## 3 1.3 22.0 319.1 0
## 4 0.5 31.0 478.3 0
## 5 0.6 31.0 611.8 0
## 6 0.5 27.8 707.6 0
## Water_prod_cum..m3. Cond_prod_cum..m3. Project_code
## 1 2.3 0.5 0
## 2 4.0 0.8 0
## 3 9.0 2.1 0
## 4 11.6 2.6 0
## 5 15.3 3.2 0
## 6 19.4 3.7 0
Column names in the original file have inconvenient interpunction, which can be easily fixed:
names(wells) <- str_replace_all(names(wells), c(" " = "" ,
"," = "",
"\\("="_",
"\\)"="",
"\\.."="_",
"\\."=""
))
names(wells)
## [1] "Wa_num" "Compltn_event_seq" "Prod_period"
## [4] "UWI" "Area_code" "Formtn_code"
## [7] "Pool_seq" "Gas_prod_vol_e3m3" "Oil_prod_vol_m3"
## [10] "Water_prod_vol_m3" "Cond_prod_vol_m3" "Prod_days"
## [13] "Gas_prod_cum_e3m3" "Oil_prod_cum_m3" "Water_prod_cum_m3"
## [16] "Cond_prod_cum_m3" "Project_code"
wells <- subset(wells, select=c(Wa_num, Prod_period, Prod_days, Area_code, Formtn_code, Gas_prod_vol_e3m3, Oil_prod_vol_m3, Water_prod_vol_m3, Cond_prod_vol_m3)) # select only useful columns
area_codes <- read.table("BCOGC_data/ogc_area_codes.csv",
sep = ",", skip = 1, header = TRUE, stringsAsFactors = FALSE)
head(area_codes, 3)
## Area.Code Area.Name
## 1 50 ADSETT
## 2 100 AIRPORT
## 3 200 AITKEN CREEK
# rename so that we have the same name of Area_code and Area_name in all columns
area_codes <- rename(area_codes, Area_code = Area.Code, Area_name = Area.Name)
formation_codes <- read.table("BCOGC_data/ogc_formation_codes.csv",
sep = ",", skip = 1, header = TRUE, stringsAsFactors = FALSE)
head(formation_codes, 3)
## Formation.Code Formation.Name
## 1 4610 A MARKER/BASE OF LIME
## 2 2703 AITKEN CREEK
## 3 9922 ALDRIDGE
formation_codes <- rename(formation_codes, Formtn_code = Formation.Code, Formtn_name = Formation.Name)
Merge dataframes to create 2 additional columns with area and name as the wells file contains only its numerical codes.
wells <- merge(wells, area_codes, by = 'Area_code')
wells <- merge(wells, formation_codes, by = 'Formtn_code')
# we don't need them anymore in memory and dataframe
rm(area_codes, formation_codes) # remove form memory
wells <- subset(wells, select = -c(Area_code, Formtn_code)) # remove from dataframe
First glimpse on the data. Notice newly created columns, Area_name and Formtn_name at the end.
names(wells)
## [1] "Wa_num" "Prod_period" "Prod_days"
## [4] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3" "Water_prod_vol_m3"
## [7] "Cond_prod_vol_m3" "Area_name" "Formtn_name"
str(wells)
## 'data.frame': 517518 obs. of 9 variables:
## $ Wa_num : int 14893 25370 25370 26962 14893 14893 25371 25371 25371 26962 ...
## $ Prod_period : int 201809 202003 201901 201709 201805 202001 202011 201912 201805 201907 ...
## $ Prod_days : num 30 30.7 30.8 23 31 31 28.5 30.4 30.4 28 ...
## $ Gas_prod_vol_e3m3: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Oil_prod_vol_m3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Water_prod_vol_m3: num 91.4 1291 2528 1068 6948.7 ...
## $ Cond_prod_vol_m3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Area_name : chr "DESAN" "PEEJAY WEST" "PEEJAY WEST" "BEAVERTAIL" ...
## $ Formtn_name : chr "QUATERNARY" "QUATERNARY" "QUATERNARY" "QUATERNARY" ...
summary(wells)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3
## Min. : 29 Min. :201601 Min. : 0.00 Min. : 0.0
## 1st Qu.:15183 1st Qu.:201704 1st Qu.:25.60 1st Qu.: 38.2
## Median :22328 Median :201807 Median :30.00 Median : 124.8
## Mean :21074 Mean :201810 Mean :26.08 Mean : 555.4
## 3rd Qu.:28257 3rd Qu.:201910 3rd Qu.:31.00 3rd Qu.: 608.5
## Max. :41055 Max. :202101 Max. :31.10 Max. :119042.3
## Oil_prod_vol_m3 Water_prod_vol_m3 Cond_prod_vol_m3 Area_name
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Length:517518
## 1st Qu.: 0.00 1st Qu.: 0.9 1st Qu.: 0.00 Class :character
## Median : 0.00 Median : 5.0 Median : 0.00 Mode :character
## Mean : 10.77 Mean : 181.4 Mean : 19.31
## 3rd Qu.: 0.00 3rd Qu.: 52.6 3rd Qu.: 1.00
## Max. :7089.30 Max. :29177.7 Max. :7424.00
## Formtn_name
## Length:517518
## Class :character
## Mode :character
##
##
##
# Let's check number of formations in the file...
length(unique(wells$Formtn_name))
## [1] 92
# ...and list them
unique(wells$Formtn_name)
## [1] "QUATERNARY" "CRETACEOUS"
## [3] "CARDIUM SAND" "DOE CREEK"
## [5] "DUNVEGAN" "BASE OF FISH SCALES"
## [7] "SCATTER" "PADDY"
## [9] "PADDY- CADOTTE" "CADOTTE"
## [11] "SPIRIT RIVER" "NOTIKEWIN"
## [13] "FALHER" "FALHER A"
## [15] "FALHER B" "FALHER C"
## [17] "FALHER D" "WILRICH"
## [19] "BLUESKY" "BASAL BLUESKY"
## [21] "BLUESKY-GETHING" "BLUESKY-GETHING-DETRITAL"
## [23] "DETRITAL" "GETHING"
## [25] "LOWER GETHING" "BASAL GETHING"
## [27] "GETHING-BALDONNEL" "CADOMIN"
## [29] "CHINKEH" "NIKANASSIN"
## [31] "DUNLEVY" "LOWER DUNLEVY"
## [33] "ROCK CREEK" "NORDEGG"
## [35] "NORDEGG-BALDONNEL" "PARDONET-BALDONNEL"
## [37] "BALDONNEL" "BALDONNEL/UPPER CHARLIE LAKE"
## [39] "CHARLIE LAKE" "SIPHON"
## [41] "CECIL" "NANCY"
## [43] "FLATROCK" "BOUNDARY LAKE"
## [45] "YELLOW MARKER" "COPLIN"
## [47] "MICA" "BLUEBERRY"
## [49] "INGA" "NORTH PINE"
## [51] "BEAR FLAT" "PINGEL"
## [53] "LIMESTONE A BED" "A MARKER/BASE OF LIME"
## [55] "LOWER CHARLIE LAKE SANDS" "ARTEX"
## [57] "ARTEX/HALFWAY" "UPPER HALFWAY"
## [59] "HALFWAY" "LOWER HALFWAY"
## [61] "DOIG" "DOIG PHOSPHATE BEDS"
## [63] "BLUESKY-GETHING-MONTNEY" "LOWER CHARLIE LAKE/MONTNEY"
## [65] "DOIG PHOSPHATE-MONTNEY" "MONTNEY"
## [67] "BELLOY" "BELCOURT"
## [69] "BELCOURT-TAYLOR FLAT" "BELLOY-KISKATINAW"
## [71] "TAYLOR FLAT" "MISSISSIPPIAN"
## [73] "KISKATINAW" "LOWER KISKATINAW"
## [75] "BASAL KISKATINAW" "UPPER DEBOLT"
## [77] "DEBOLT" "ELKTON"
## [79] "SHUNDA" "PEKISKO"
## [81] "BANFF" "BESA RIVER"
## [83] "WABAMUN" "TETCHO"
## [85] "KAKISA" "JEAN MARIE"
## [87] "MUSKWA" "MUSKWA-OTTER PARK"
## [89] "SLAVE POINT" "SULPHUR POINT"
## [91] "EVIE" "PINE POINT"
slave_pnt <- filter(wells, Formtn_name == "SLAVE POINT" )
unique(slave_pnt$Formtn_name)
## [1] "SLAVE POINT"
muskwa <- filter(wells, Formtn_name %in% c("MUSKWA", "MUSKWA-OTTER PARK"))
unique(muskwa$Formtn_name)
## [1] "MUSKWA" "MUSKWA-OTTER PARK"
# same as
muskwa <- filter(wells, Formtn_name == "MUSKWA" | Formtn_name == "MUSKWA-OTTER PARK")
unique(muskwa$Formtn_name)
## [1] "MUSKWA" "MUSKWA-OTTER PARK"
Wa_num_sorted <- arrange(wells, Wa_num) # ascending
head(Wa_num_sorted)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 29 201703 30.0 73.0 0
## 2 29 201704 24.0 64.7 0
## 3 29 201706 3.0 7.2 0
## 4 29 201707 15.1 53.8 0
## 5 29 201611 22.0 166.2 0
## 6 29 201601 31.0 107.7 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name
## 1 2.8 0.3 FORT ST JOHN BELLOY
## 2 1.9 0.3 FORT ST JOHN BELLOY
## 3 0.3 0.1 FORT ST JOHN BELLOY
## 4 2.0 0.5 FORT ST JOHN BELLOY
## 5 5.0 1.3 FORT ST JOHN BELLOY
## 6 2.3 0.5 FORT ST JOHN BELLOY
Wa_num_sorted <- arrange(wells, desc(Wa_num)) # descending
head(Wa_num_sorted)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 41055 202012 30.6 15051.0 0
## 2 41055 202101 30.8 12528.5 0
## 3 41055 202011 5.5 1293.3 0
## 4 41054 202011 5.6 1658.9 0
## 5 41054 202012 30.8 19143.3 0
## 6 41054 202101 30.8 17839.7 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name
## 1 3630.5 950.8 HERITAGE MONTNEY
## 2 2087.3 770.4 HERITAGE MONTNEY
## 3 3396.8 110.8 HERITAGE MONTNEY
## 4 3444.7 317.9 HERITAGE MONTNEY
## 5 4204.7 1691.9 HERITAGE MONTNEY
## 6 2476.8 1170.2 HERITAGE MONTNEY
Wa_num_sorted <- arrange(wells, desc(Wa_num), Prod_period) # by multiple conditions
head(Wa_num_sorted)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 41055 202011 5.5 1293.3 0
## 2 41055 202012 30.6 15051.0 0
## 3 41055 202101 30.8 12528.5 0
## 4 41054 202011 5.6 1658.9 0
## 5 41054 202012 30.8 19143.3 0
## 6 41054 202101 30.8 17839.7 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name
## 1 3396.8 110.8 HERITAGE MONTNEY
## 2 3630.5 950.8 HERITAGE MONTNEY
## 3 2087.3 770.4 HERITAGE MONTNEY
## 4 3444.7 317.9 HERITAGE MONTNEY
## 5 4204.7 1691.9 HERITAGE MONTNEY
## 6 2476.8 1170.2 HERITAGE MONTNEY
rm(Wa_num_sorted)
names(wells)
## [1] "Wa_num" "Prod_period" "Prod_days"
## [4] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3" "Water_prod_vol_m3"
## [7] "Cond_prod_vol_m3" "Area_name" "Formtn_name"
head(select(wells, Area_name, Formtn_name))
## Area_name Formtn_name
## 1 DESAN QUATERNARY
## 2 PEEJAY WEST QUATERNARY
## 3 PEEJAY WEST QUATERNARY
## 4 BEAVERTAIL QUATERNARY
## 5 DESAN QUATERNARY
## 6 DESAN QUATERNARY
# instead of listing columns separately, we can list multiple columns next to each other using ":"
head(select(wells, Gas_prod_vol_e3m3:Cond_prod_vol_m3, Area_name, Formtn_name))
## Gas_prod_vol_e3m3 Oil_prod_vol_m3 Water_prod_vol_m3 Cond_prod_vol_m3
## 1 0 0 91.4 0
## 2 0 0 1291.0 0
## 3 0 0 2528.0 0
## 4 0 0 1068.0 0
## 5 0 0 6948.7 0
## 6 0 0 109.3 0
## Area_name Formtn_name
## 1 DESAN QUATERNARY
## 2 PEEJAY WEST QUATERNARY
## 3 PEEJAY WEST QUATERNARY
## 4 BEAVERTAIL QUATERNARY
## 5 DESAN QUATERNARY
## 6 DESAN QUATERNARY
The same wells have been listed multiple times as there are various production periods each year. Let’s create new column Prod_year to make more general summary.
head(wells$Prod_period)
## [1] 201809 202003 201901 201709 201805 202001
wells <- wells %>%
mutate(Prod_year = substr(Prod_period, 1, 4),
Prod_month = substr(Prod_period, 5, 6),
Prod_ym = paste(Prod_year, Prod_month, sep="-")) #to create time "points" only from available dates
head(wells)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 14893 201809 30.0 0 0
## 2 25370 202003 30.7 0 0
## 3 25370 201901 30.8 0 0
## 4 26962 201709 23.0 0 0
## 5 14893 201805 31.0 0 0
## 6 14893 202001 31.0 0 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name Prod_year
## 1 91.4 0 DESAN QUATERNARY 2018
## 2 1291.0 0 PEEJAY WEST QUATERNARY 2020
## 3 2528.0 0 PEEJAY WEST QUATERNARY 2019
## 4 1068.0 0 BEAVERTAIL QUATERNARY 2017
## 5 6948.7 0 DESAN QUATERNARY 2018
## 6 109.3 0 DESAN QUATERNARY 2020
## Prod_month Prod_ym
## 1 09 2018-09
## 2 03 2020-03
## 3 01 2019-01
## 4 09 2017-09
## 5 05 2018-05
## 6 01 2020-01
class(wells$Prod_ym)
## [1] "character"
head(wells$Prod_period)
## [1] 201809 202003 201901 201709 201805 202001
prod_days_by_area <- wells %>%
group_by(Area_name) %>%
summarize(
total_prod_days = sum(Prod_days)
)
head(prod_days_by_area)
## # A tibble: 6 x 2
## Area_name total_prod_days
## <chr> <dbl>
## 1 ADSETT 6385.
## 2 AIRPORT 141.
## 3 AITKEN CREEK 4164.
## 4 AITKEN CREEK NORTH 1885.
## 5 ALTARES 14861.
## 6 BEAVERDAM 6439.
prod_days_by_area %>% filter(total_prod_days > 500000) %>%
ggplot(aes(x = Area_name, y = total_prod_days)) +
geom_col() +
ggtitle("Total Production Days for most productive areas", ) +
ylab("Production days") +
xlab("Area")
Histograms are commonly used to visualize the distribution of our data. The bars of histogram correspond to the number of observations within specific bin. Note: it’s recommended to check different bin sizes to find the most optimal bin for the data.
wells %>% filter(Cond_prod_vol_m3 > 0) %>%
ggplot() +
geom_histogram(mapping = aes(Cond_prod_vol_m3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# let's limit our data to the wells which had production (100-1000). Here, we use default number of bins (30)
wells %>% filter(Cond_prod_vol_m3 > 100, Cond_prod_vol_m3 < 1000) %>%
ggplot() +
geom_histogram(mapping = aes(Cond_prod_vol_m3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# increase number of bins to 200 to see the difference
wells %>% filter(Cond_prod_vol_m3 > 100, Cond_prod_vol_m3 < 1000) %>%
ggplot() +
geom_histogram(mapping = aes(Cond_prod_vol_m3), bins=200) +
ggtitle("Distribution of condensate production per production period") +
xlab(expression("Condensate production per production period m"^3))
Tables are a nice way to present numerical data and identify the trends. Let’s analyze the water production in different formations.
# Using tables we can see how many wells we have for each formation
wells %>% group_by(Formtn_name) %>%
summarize(
count_wells = n_distinct(Wa_num),#count unique well numbers (multiple prod. periods)
mean_wtr_prod = mean(Water_prod_vol_m3)
) %>% arrange(desc(count_wells))
## # A tibble: 92 x 3
## Formtn_name count_wells mean_wtr_prod
## <chr> <int> <dbl>
## 1 MONTNEY 4745 150.
## 2 JEAN MARIE 1531 3.21
## 3 HALFWAY 659 62.2
## 4 BLUESKY 589 2038.
## 5 CADOMIN 535 10.7
## 6 NOTIKEWIN 406 5.03
## 7 BALDONNEL 356 62.3
## 8 BLUESKY-GETHING-MONTNEY 354 1.34
## 9 GETHING 330 2.33
## 10 BOUNDARY LAKE 278 483.
## # ... with 82 more rows
# we can find which areas / formations are suspected to have more water disposal wells than HC production wells
wells %>% group_by(Area_name) %>%
summarize(
count_wells = n_distinct(Wa_num),
mean_wtr_prod = mean(Water_prod_vol_m3)
) %>% arrange(desc(mean_wtr_prod))
## # A tibble: 169 x 3
## Area_name count_wells mean_wtr_prod
## <chr> <int> <dbl>
## 1 HAY RIVER 211 4481.
## 2 KLUA 1 4328.
## 3 SUNRISE 14 2990.
## 4 TOWER LAKE 7 2458.
## 5 CLARKE LAKE 38 2145.
## 6 LAPP 9 1699.
## 7 GOPHER 1 1527.
## 8 WOODRUSH 8 867.
## 9 TWO RIVERS 9 708.
## 10 LIARD 6 700.
## # ... with 159 more rows
Scatterplots are useful when we want to show the relationship between 2 variables.
We can convert our dataframe to the long format to have all types of hydrocarbon production in one column using gather function. Picture below visualizes this:
# let's look again on the selected columns of original dataframe
wells %>% select(Water_prod_vol_m3, Gas_prod_vol_e3m3, Oil_prod_vol_m3, Cond_prod_vol_m3) %>%
head(3)
## Water_prod_vol_m3 Gas_prod_vol_e3m3 Oil_prod_vol_m3 Cond_prod_vol_m3
## 1 91.4 0 0 0
## 2 1291.0 0 0 0
## 3 2528.0 0 0 0
# we will use gather function to create new column "Prod_type" which will contain the "Prod_volume" value for gas, oil and condensate production. We assign the results to new dataframe.
prod_df <- wells %>%
gather(key = "Prod_type",
value = "Prod_volume",
c(Gas_prod_vol_e3m3, Oil_prod_vol_m3, Cond_prod_vol_m3)) %>%
select(Water_prod_vol_m3, Prod_volume, Prod_year, Prod_ym, Prod_type, Prod_days, Formtn_name)
prod_df %>% select(Water_prod_vol_m3, Prod_type) %>% head(3)
## Water_prod_vol_m3 Prod_type
## 1 91.4 Gas_prod_vol_e3m3
## 2 1291.0 Gas_prod_vol_e3m3
## 3 2528.0 Gas_prod_vol_e3m3
# indeed, we have production type in Prod_type column!
unique(prod_df$Prod_type)
## [1] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3" "Cond_prod_vol_m3"
# let's visualize that quickly
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
geom_point()
# add some axis limits
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
geom_point() +
xlim(0,10000) +
ylim(0,25000)
## Warning: Removed 45 rows containing missing values (geom_point).
# visualize each type of production separately
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
geom_point() +
facet_wrap(~Prod_type)
# create indepentent y axis for each chart, since the production is in different units.
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type, alpha = 0.3)) +
geom_point() +
facet_wrap(~Prod_type, scales = "free") # scales free => not fixed limits of yaxis
# I expect that Prod_volume > 0 will indicate the SWD disposal wells, let's remove those from the plot
prod_df %>%
filter(Prod_year == 2021, Prod_volume > 0) %>% # Prod_colume > 0 to remove disposal wells
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type, alpha = 0.3)) +
geom_point() +
ggtitle("Water production per HC type") +
facet_wrap(~Prod_type, scales = "free") +
xlim(0,5000)
## Warning: Removed 126 rows containing missing values (geom_point).
Bar charts can be created in using two geometries: geom_col - by default height of the bars represent value in the data geom_bar - by default counts number of cases in each group
### Let's start with geom_col to visualize the production per period...
wells %>% subset(Prod_year > 2000) %>%
group_by(Prod_ym) %>%
summarise(
prod_per_period = sum(Cond_prod_vol_m3)
) %>%
ggplot(aes(x = Prod_ym, y = prod_per_period)) +
theme(axis.text.x = element_text(angle = 90)) +
geom_col()
### ...and per year. Which one is more useful / interesting?
wells %>%
subset(Prod_year > 2000) %>%
group_by(Prod_year) %>%
summarise(
prod_per_year = sum(Cond_prod_vol_m3)
) %>%
ggplot(aes(x = Prod_year, y = prod_per_year)) +
geom_col()
class(wells$Prod_period)
## [1] "integer"
Line graphs are useful to visualize the change in time. We can visualize the trends in production for example.
# reminder: available formation names in our dataframe
unique(wells$Formtn_name)[1:20]
## [1] "QUATERNARY" "CRETACEOUS" "CARDIUM SAND"
## [4] "DOE CREEK" "DUNVEGAN" "BASE OF FISH SCALES"
## [7] "SCATTER" "PADDY" "PADDY- CADOTTE"
## [10] "CADOTTE" "SPIRIT RIVER" "NOTIKEWIN"
## [13] "FALHER" "FALHER A" "FALHER B"
## [16] "FALHER C" "FALHER D" "WILRICH"
## [19] "BLUESKY" "BASAL BLUESKY"
# let's visualize only one formation first
formtnName = "BLUESKY"
prod_df %>%
filter(Formtn_name == formtnName) %>%
group_by(Prod_ym, Formtn_name, Prod_type) %>%
summarise(
total_prod = sum(Prod_volume)
) %>%
ungroup() %>%
ggplot(aes(x = Prod_ym, y = total_prod, group=Formtn_name, col=Prod_type)) +
geom_line() +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_grid(Prod_type ~ ., scales = "free") +
ggtitle(paste("Production per year for formation", formtnName))
## `summarise()` has grouped output by 'Prod_ym', 'Formtn_name'. You can override using the `.groups` argument.
# and now production from all formations in one chart
prod_df %>%
group_by(Prod_ym, Prod_type) %>%
summarise(
total_prod = sum(Prod_volume)
) %>%
ungroup() %>%
ggplot(aes(x = Prod_ym, y = total_prod, group=1, col=Prod_type)) +
geom_line() +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_grid(Prod_type ~ ., scales = "free") +
ggtitle("Total production per year")
## `summarise()` has grouped output by 'Prod_ym'. You can override using the `.groups` argument.
We can apply simple trick to order the formations from the highest production to the lowest (using mutate function).
We save most productive formations separately, they will be useful later to plot production in time.
# first let's create dataframe with summed produced values by formation
prod_by_formation <- wells %>%
group_by(Formtn_name) %>%
summarise(
oil_prod_total = sum(Oil_prod_vol_m3),
gas_prod_total = sum(Gas_prod_vol_e3m3),
cond_prod_total = sum(Cond_prod_vol_m3),
water_prod_total = sum(Water_prod_vol_m3)
) %>%
ungroup()
# find 5 most productive gas formation
most_gas <- prod_by_formation %>%
arrange(desc(gas_prod_total)) %>%
head(5)
ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
geom_col(fill='#ff6347') # change color to red
# we can use a trick to order the bars according to production value (using mutate function)
most_gas <- most_gas %>%
mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))
ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
geom_col(fill='#ff6347') # change color to red
# same for oil production
most_oil <- prod_by_formation %>%
arrange(desc(oil_prod_total)) %>%
head(5) %>%
mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))
ggplot(most_oil, aes(x = Formtn_name, y = oil_prod_total)) +
geom_col()
# and for condensate production
most_cond <- prod_by_formation %>%
arrange(desc(cond_prod_total)) %>%
head(5) %>%
mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))
ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
geom_col(fill="#6495ed")
We can also visualize the same set of formations. To define the set, we will take 5 most productive gas, oil and condensate formations.
# define 5 most productive formations
best_formations <- unique(most_oil$Formtn_name,
most_gas$Formtn_name,
most_cond$Formtn_name)
# plot oil production
wells %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = Oil_prod_vol_m3, fill=Formtn_name)) +
geom_col() +
ggtitle("Total oil production by formation") +
xlab("Formation") +
ylab("Oil production [m3]")
# plot gas production
prod_by_formation %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = gas_prod_total, fill=Formtn_name)) +
geom_col() +
ggtitle("Total gas production by formation") +
xlab("Formation") +
ylab("Gas production [e3m3]")
# plot condensate production
prod_by_formation %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = cond_prod_total, fill=Formtn_name)) +
geom_col() +
ggtitle("Total condensate production by formation") +
xlab("Formation") +
ylab("Condensate production [m3]")
# plot water production
prod_by_formation %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = water_prod_total, fill=Formtn_name)) +
geom_col() +
ggtitle("Total water production by formation") +
xlab("Formation") +
ylab("Water production [m3]")
{r eval=TRUE, echo=FALSE} # gas_prod <- prod_by_formation[prod_by_formation$gas_prod_total > 0,] # oil_prod <- prod_by_formation[prod_by_formation$oil_prod_total > 0,] # cond_prod <- prod_by_formation[prod_by_formation$cond_prod_total > 0,] #let’s see how facets can increase the visibility of our charts ()
#library(tidyverse)
# here we plot the count of wells per year for all formations
ggplot(data = wells, aes(x = Prod_year)) +
geom_bar()
# filter wells for most productive formations and most recent wells. Add some colors (fill) selected manually (scale_fill_manual)
ggplot(filter(wells, Formtn_name %in% best_formations & Prod_year < 2021)) +
geom_bar(aes(x = Formtn_name, fill=Formtn_name)) +
scale_fill_manual(values = c('#d0d1e6','#a6bddb','#67a9cf','#1c9099','#016c59')) + # taken e.g. from colorbrewer
facet_wrap(~ Prod_year) +
theme(axis.text.x = element_blank(), # remove ticks #cosmetics
axis.ticks.x = element_blank(),
axis.title.x = element_blank())
Making interactive plots in R is extremely easy. First, we need to create standard “static” chart and then feed it to plotly.
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
head(prod_df)
## Water_prod_vol_m3 Prod_volume Prod_year Prod_ym Prod_type Prod_days
## 1 91.4 0 2018 2018-09 Gas_prod_vol_e3m3 30.0
## 2 1291.0 0 2020 2020-03 Gas_prod_vol_e3m3 30.7
## 3 2528.0 0 2019 2019-01 Gas_prod_vol_e3m3 30.8
## 4 1068.0 0 2017 2017-09 Gas_prod_vol_e3m3 23.0
## 5 6948.7 0 2018 2018-05 Gas_prod_vol_e3m3 31.0
## 6 109.3 0 2020 2020-01 Gas_prod_vol_e3m3 31.0
## Formtn_name
## 1 QUATERNARY
## 2 QUATERNARY
## 3 QUATERNARY
## 4 QUATERNARY
## 5 QUATERNARY
## 6 QUATERNARY
# let's filter the data first and summarize the production
prod_per_period <- prod_df %>%
filter(Formtn_name %in% best_formations) %>%
group_by(Prod_ym, Prod_type, Formtn_name) %>%
summarise(
prod_per_period = sum(Prod_volume)
)
## `summarise()` has grouped output by 'Prod_ym', 'Prod_type'. You can override using the `.groups` argument.
# let's plot it according to the production type
p <- ggplot(prod_per_period, aes(x=Prod_ym, y=prod_per_period, group=Formtn_name, fill=Formtn_name)) +
geom_area(alpha=0.3) +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_grid(Prod_type ~ ., scales = "free")
p <- ggplotly(p)
p
# That's nice but let's add some details, like text when we hover over charts.
p <- ggplot(prod_per_period, aes(x=Prod_ym, y=prod_per_period, group=Prod_type,
text = paste("Formation:", Formtn_name,
"<br>Prod. type:", Prod_type), fill=Prod_type)) +
geom_area(colour="#636363", alpha=0.3) +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_wrap(Formtn_name ~ ., scales = "free") +
ylab("") +
xlab("")
p <- ggplotly(p, tooltip = "text")
p
Let’s move to another dataset provided by BCOGC to access the location of the wells and plot them together with geophysical lines, O&G pools and fields. O&G field => group of O&G pools
Here we will use geom_sf geometry and sf package, commonly used for geospatial data manipulation. We fill use following sf functions: st_as_sf() - convert foreign object to the sf object st_crs() - get coordinate system of the feature st_transform() - transform features (for example change coordinate system) st_union() - merge multiple features into one st_intersects() -
st_convex_hull() - create polygon around the points (minimum bounding area) st_within() - get features within other feature st_crop() - crop the feature
rm(list = ls(all=TRUE)) # clear global environment
path <- file.path("C:","Users","Paulina-laptop","Desktop","R_Programming","RLadies_workshop")
setwd(path)
sprintf("Current directory: %s", getwd())
## [1] "Current directory: C:/Users/Paulina-laptop/Desktop/R_Programming/RLadies_workshop"
library(sf)
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
# load dplyr and ggplot2 in case this is the only chunk you want to run
library(dplyr)
library(ggplot2)
wells_coords <- read.csv(unzip("BCOGC_data/drill_csv.zip", "wells.csv"), skip = 1)
head(wells_coords)#bigger file
## Well.Surf.Loc Well.Name
## 1 D- 080-H/092-G-03 ROYAL CITY NO. 1 D- 080-H/092-G-03
## 2 B- 050-D/082-G-01 CANADIAN KOOTENAY NO. 1 B- 050-D/082-G-01
## 3 D- 055-A/082-G-02 BORDER OILS NO. 1 D- 055-A/082-G-02
## 4 13-11-081-17 CANLIN PINGEL 13-11-081-17
## 5 A- 065-E/093-I-16 CVE ENERGY ET AL LONE MOUNTAIN NO. 1 A- 065-E/093-I-16
## 6 C- 048-D/103-G-05 STRATHCONA QUEEN CHARLOTTE NO.1 C- 048-D/103-G-05
## WA.Num Surf.Nad27.Lat Surf.Nad27.Long Surf.Nad83.Lat Surf.Nad83.Long
## 1 1 49084860 123064913 49085314 123065320
## 2 2 49020730 114294872 49035335 114497851
## 3 3 49025250 114331128 49047892 114554121
## 4 4 NA NA 56004511 120331605
## 5 5 54530889 120254600 54530863 120255126
## 6 6 53172250 131590333 53171412 131575692
## Surf.UTM.Zone.Num Surf.UTM83.Northng Surf.UTM83.Eastng Surf.North Surf.East
## 1 10 5443925 491629.8 -192.0 -71.3
## 2 11 5434400 682875.8 NA NA
## 3 11 5435662 678718.6 NA NA
## 4 10 6210173 652456.4 -201.2 221.3
## 5 10 6085099 664789.3 274.9 -285.3
## 6 9 5908329 302312.1 -463.9 -107.9
## Surf.Owner Ground.Elevtn Directional.Flag Surf.Ref.Corner Surf.Ref.Unit
## 1 CROWN 1.5 N NE 80
## 2 CROWN NA N NW 50
## 3 CROWN NA N SW 55
## 4 CROWN 758.6 N NW NA
## 5 CROWN 971.6 N SE 65
## 6 CROWN 8.3 N NE 48
## Surf.Ref.Block Surf.Ref.Map Surf.Ref.Lsd Surf.Ref.Sect Surf.Ref.Twp
## 1 H 092-G-03 NA NA NA
## 2 D 082-G-01 NA NA NA
## 3 A 082-G-02 NA NA NA
## 4 13 11 81
## 5 E 093-I-16 NA NA NA
## 6 D 103-G-05 NA NA NA
## Surf.Ref.Range Surf.DLS.Exception Surf.Lsd Surf.Sect Surf.Twp Surf.Range
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 17 13 11 81 17
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## Surf.Qtr.Unit Surf.NTS.Exception Surf.Unit Surf.Block Surf.Map Oper.Id
## 1 D 80 H 092-G-03 0
## 2 B 50 D 082-G-01 0
## 3 D 55 A 082-G-02 0
## 4 NA 1018
## 5 A 65 E 093-I-16 1016
## 6 C 48 D 103-G-05 11075
## Oper.Abbrev Oper.Abbrev2 Optnl.Unit Well.Area.Name Well.Name.Date
## 1 ROYAL CITY NO. 1 19480610
## 2 CANADIAN KOOTENAY NO. 1 19490903
## 3 BORDER OILS NO. 1 19480704
## 4 CANLIN PINGEL 20171206
## 5 CVE ENERGY ET AL LONE MOUNTAIN NO. 1 20180607
## 6 STRATHCONA QUEEN CHARLOTTE NO.1 20200901
## Special.Well.Class.Code Test.Hole
## 1 NO N
## 2 NO N
## 3 N
## 4 NO N
## 5 NO N
## 6 NO N
names(wells_coords)
## [1] "Well.Surf.Loc" "Well.Name"
## [3] "WA.Num" "Surf.Nad27.Lat"
## [5] "Surf.Nad27.Long" "Surf.Nad83.Lat"
## [7] "Surf.Nad83.Long" "Surf.UTM.Zone.Num"
## [9] "Surf.UTM83.Northng" "Surf.UTM83.Eastng"
## [11] "Surf.North" "Surf.East"
## [13] "Surf.Owner" "Ground.Elevtn"
## [15] "Directional.Flag" "Surf.Ref.Corner"
## [17] "Surf.Ref.Unit" "Surf.Ref.Block"
## [19] "Surf.Ref.Map" "Surf.Ref.Lsd"
## [21] "Surf.Ref.Sect" "Surf.Ref.Twp"
## [23] "Surf.Ref.Range" "Surf.DLS.Exception"
## [25] "Surf.Lsd" "Surf.Sect"
## [27] "Surf.Twp" "Surf.Range"
## [29] "Surf.Qtr.Unit" "Surf.NTS.Exception"
## [31] "Surf.Unit" "Surf.Block"
## [33] "Surf.Map" "Oper.Id"
## [35] "Oper.Abbrev" "Oper.Abbrev2"
## [37] "Optnl.Unit" "Well.Area.Name"
## [39] "Well.Name.Date" "Special.Well.Class.Code"
## [41] "Test.Hole"
# subset dataframe
wells_coords <- wells_coords %>% select(Surf.UTM83.Northng, Surf.UTM83.Eastng, Surf.UTM.Zone.Num, Well.Area.Name, Oper.Abbrev, Surf.Owner)
# select only wells from the UNT10 and UTM11 zones
wells_coords <- wells_coords %>% filter(complete.cases(.), Surf.UTM.Zone.Num %in% c(10,11))
unique(wells_coords$Surf.UTM.Zone.Num)
## [1] 10 11
# create 2 geodataframes with crs values according to UTM zone
wells_pts_1 <- wells_coords %>%
filter(Surf.UTM.Zone.Num == 10) %>%
st_as_sf(coords = c("Surf.UTM83.Eastng", "Surf.UTM83.Northng"), crs = 26910) %>%
st_transform(4326) # want to use latlon coords
wells_pts_2 <- wells_coords %>%
filter(Surf.UTM.Zone.Num == 11) %>%
st_as_sf(coords = c("Surf.UTM83.Eastng", "Surf.UTM83.Northng"), crs = 26910) %>%
st_transform(4326) # want to use latlon coords
# bind dataframe rows vertically and check the coordinate system
wells_pts <- rbind(wells_pts_1, wells_pts_2)
st_crs(wells_pts)
## Coordinate Reference System:
## User input: EPSG:4326
## wkt:
## GEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["geodetic latitude (Lat)",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["geodetic longitude (Lon)",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## USAGE[
## SCOPE["Horizontal component of 3D system."],
## AREA["World."],
## BBOX[-90,-180,90,180]],
## ID["EPSG",4326]]
# now let's load geoophysical lines
geoph_lines <- st_read("BCOGC_data/shapefiles/PASR_GEOPHYSICAL_LN_subset.shp")
## Reading layer `PASR_GEOPHYSICAL_LN_subset' from data source `C:\Users\Paulina-laptop\Desktop\R_programming\RLadies_workshop\BCOGC_data\shapefiles\PASR_GEOPHYSICAL_LN_subset.shp' using driver `ESRI Shapefile'
## Simple feature collection with 49710 features and 19 fields
## Geometry type: MULTILINESTRING
## Dimension: XY
## Bounding box: xmin: 1266576 ymin: 1173555 xmax: 1374678 ymax: 1265680
## Projected CRS: NAD83 / BC Albers
# we will focus only on the lines within Dawson Area
geoph_lines_dawson <- filter(geoph_lines, PROG_NAME %in% c("DAWSON 3D","DAWSON 3D EXTENSION"))
# let's set the same crs as the wells
geoph_lines_dawson <- st_transform(geoph_lines_dawson, st_crs(wells_pts))
# merge the lines and create the polygon to filter the wells
dawson_area <- st_union(geoph_lines_dawson) %>%
st_convex_hull()
## although coordinates are longitude/latitude, st_union assumes that they are planar
# filter the wells
wells_dawson <- wells_pts %>%
filter(st_within(x = ., y = dawson_area, sparse = FALSE))
## although coordinates are longitude/latitude, st_within assumes that they are planar
# plot together
ggplot() +
geom_sf(data = wells_dawson, size=1, alpha=0.2, colour='blue') +
geom_sf(data = geoph_lines_dawson, size=0.1)
names(wells_dawson)
## [1] "Surf.UTM.Zone.Num" "Well.Area.Name" "Oper.Abbrev"
## [4] "Surf.Owner" "geometry"
# let's add O&G pools, transform crs and find the polls intersecting Dawson area
pools <- st_read("BCOGC_data/shapefiles/POOL_CONTOURS_LN.shp")
## Reading layer `POOL_CONTOURS_LN' from data source `C:\Users\Paulina-laptop\Desktop\R_programming\RLadies_workshop\BCOGC_data\shapefiles\POOL_CONTOURS_LN.shp' using driver `ESRI Shapefile'
## Simple feature collection with 7963 features and 12 fields
## Geometry type: MULTILINESTRING
## Dimension: XY
## Bounding box: xmin: 1091006 ymin: 1070402 xmax: 1386930 ymax: 1679933
## Projected CRS: NAD83 / BC Albers
# get the production pools intersecting the Dawson area only
pools <- st_transform(pools, st_crs(wells_dawson))
pools_dawson <- pools %>%
filter(st_intersects(x = ., y = dawson_area, sparse = FALSE))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
# plot it
ggplot() +
geom_sf(data = wells_dawson, alpha=0.2) +
geom_sf(data = geoph_lines_dawson, alpha=0.2) +
geom_sf(data = pools_dawson, aes(colour=FLUID_TYPE)) +
scale_color_discrete(name = "Producing pools") +
xlab("Longitude") +
ylab("Latitude")
# Let's add one more information:O&G fields. Transform to the crs of the wells
fields <- st_read("BCOGC_data/shapefiles/FIELDS_PY.shp")
## Reading layer `FIELDS_PY' from data source `C:\Users\Paulina-laptop\Desktop\R_programming\RLadies_workshop\BCOGC_data\shapefiles\FIELDS_PY.shp' using driver `ESRI Shapefile'
## Simple feature collection with 570 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1079392 ymin: 1066850 xmax: 1387498 ymax: 1680540
## Projected CRS: NAD83 / BC Albers
ggplot() +
geom_sf(data = fields)
fields <- st_transform(fields, st_crs(wells_dawson))
# let's focus on the fields in the smaller area (crop the layer)
fields <- st_crop(fields, xmin = -121, xmax = -120, ymin = 55.7, ymax = 56)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
unique(fields$FLDRNM)
## [1] "MICA" "PARKLAND" "DOE" "DAWSON CREEK"
## [5] "GROUNDBIRCH" "SATURN" "SUNSET PRAIRIE" "BRIAR RIDGE"
## [9] "BRASSEY" "TOWER LAKE" "SUNDOWN" "SUNRISE"
# let's visualize only Dawson field
field_dawson <- fields %>% filter(FLDRNM == "DAWSON CREEK")
# let's plot it together
ggplot() +
geom_sf(data = wells_dawson, alpha=0.2) +
geom_sf(data = geoph_lines_dawson, alpha=0.2) +
geom_sf(data = pools, aes(colour=FLUID_TYPE)) +
geom_sf(data = field_dawson, fill=NA, linetype="dashed", colour="blue", size=0.2) +
scale_color_discrete(name = "Producing pools") +
coord_sf(crs = 4326, xlim = c(-120.5,-120), ylim = c(55.7, 56)) +
xlab("Longitude") +
ylab("Latitude")